{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Gene Ontology Enrichment Analysis (GOEA)\n",
    "This is the same GOEA as in goea_nbt3102.ipynb, but the GOEA results can be obtained by calling a single function.\n",
    "\n",
    "We use data from a 2014 Nature Biotechnology paper:    \n",
    "[Computational analysis of cell-to-cell heterogeneity\n",
    "in single-cell RNA-sequencing data reveals hidden \n",
    "subpopulations of cells\n",
    "](http://www.nature.com/nbt/journal/v33/n2/full/nbt.3102.html#methods)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def get_goeaobj_nbt3102(method='fdr_bh'):\n",
    "    \"\"\"Return a GOEnrichmentStudy object ready to run on the nbt.3102 data.\n",
    "\n",
    "    Fetches the GO ontology file and the NCBI gene2go associations through\n",
    "    the goatools download helpers, then builds the study object using the\n",
    "    mouse protein-coding gene IDs as the background gene set.\n",
    "\n",
    "    Args:\n",
    "        method: multiple-test correction method name passed to\n",
    "            GOEnrichmentStudy as its only entry in ``methods``\n",
    "            (default 'fdr_bh').\n",
    "\n",
    "    Returns:\n",
    "        The configured GOEnrichmentStudy instance (alpha=0.05,\n",
    "        propagate_counts=False).\n",
    "    \"\"\"\n",
    "    from goatools.obo_parser import GODag\n",
    "    from goatools.associations import read_ncbi_gene2go\n",
    "    from goatools.test_data.genes_NCBI_10090_ProteinCoding import GeneID2nt as GeneID2nt_mus\n",
    "    from goatools.go_enrichment import GOEnrichmentStudy\n",
    "    from goatools.base import download_go_basic_obo, download_ncbi_associations\n",
    "    # Load Ontologies from the file name reported by the download helper,\n",
    "    # rather than repeating the default name as a second hard-coded literal.\n",
    "    obo_fname = download_go_basic_obo()\n",
    "    obodag = GODag(obo_fname)\n",
    "    # Load Associations\n",
    "    download_ncbi_associations() # Get ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene2go.gz\n",
    "    geneid2gos_mouse = read_ncbi_gene2go(\"gene2go\", taxids=[10090])\n",
    "    # GOE Object holds Ontologies, Associations, and Background gene set\n",
    "    return GOEnrichmentStudy(\n",
    "        GeneID2nt_mus.keys(), # Background gene set: mouse protein-coding genes\n",
    "        geneid2gos_mouse, # geneid/GO Associations\n",
    "        obodag, # Ontologies\n",
    "        propagate_counts = False,\n",
    "        alpha = 0.05, # default significance cut-off\n",
    "        methods = [method]) # default multiple-test correction method"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def read_data_nbt3102():\n",
    "    \"\"\"Read the gene list from the nbt.3102 supplementary spreadsheet.\n",
    "\n",
    "    The workbook is looked up at tests/data/nbt_3102/nbt.3102-S4_GeneIDs.xlsx\n",
    "    under the parent of the current working directory.\n",
    "\n",
    "    Returns:\n",
    "        dict mapping int gene ID (second column) to symbol (first column),\n",
    "        read from the first sheet. Rows whose gene ID cell is empty are\n",
    "        skipped. An empty dict is returned when the workbook is not found.\n",
    "    \"\"\"\n",
    "    import os\n",
    "    # Data will be stored in this variable\n",
    "    geneid2symbol = {}\n",
    "    # Get xlsx filename where data is stored\n",
    "    ROOT = os.path.dirname(os.getcwd()) # go up 1 level from current working directory\n",
    "    din_xlsx = os.path.join(ROOT, \"tests/data/nbt_3102/nbt.3102-S4_GeneIDs.xlsx\")\n",
    "    # Read data\n",
    "    if os.path.isfile(din_xlsx):\n",
    "        # xlrd is imported lazily so the function still works (returning {})\n",
    "        # when the spreadsheet is absent and xlrd is not installed.\n",
    "        # NOTE(review): xlrd 2.x no longer reads .xlsx files -- confirm the\n",
    "        # installed xlrd version can open this workbook.\n",
    "        import xlrd\n",
    "        book = xlrd.open_workbook(din_xlsx)\n",
    "        sheet = book.sheet_by_index(0)\n",
    "        for row_idx in range(sheet.nrows):\n",
    "            # Only the first two columns (symbol, gene ID) are used; reading\n",
    "            # them by index avoids failing when the sheet has extra columns.\n",
    "            symbol = sheet.cell_value(row_idx, 0)\n",
    "            geneid = sheet.cell_value(row_idx, 1)\n",
    "            if geneid:\n",
    "                geneid2symbol[int(geneid)] = symbol\n",
    "    return geneid2symbol"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
